{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# File Loader Test Notebook\n",
    "\n",
    "This notebook exercises `HttpDoc.crawl_urls()` against local files in several formats (PDF, DOCX, PPTX, XLSX) and prints a short preview of the content loaded from each file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from autocoder.utils.rest import HttpDoc\n",
    "from byzerllm import ByzerLLM,SimpleByzerLLM\n",
    "import os\n",
    "from autocoder.common import AutoCoderArgs\n",
    "# Read the SaaS API key from the environment so it is never stored in the notebook.\n",
    "api_key = os.getenv(\"MODEL_DOUBAO_TOKEN\")\n",
    "if not api_key:\n",
    "    print(\"WARNING: MODEL_DOUBAO_TOKEN is not set; requests to the model may be rejected.\")\n",
    "\n",
    "# Local DOCX file loaded by the cells that call http_doc.crawl_urls().\n",
    "# Set TEST_DOCX_PATH to run this notebook on another machine; the default\n",
    "# keeps the original author's path.\n",
    "DOCX_PATH = os.getenv(\"TEST_DOCX_PATH\", \"/Users/allwefantasy/Downloads/测试.docx\")\n",
    "\n",
    "# Deploy a SaaS-backed model (a real endpoint, not a mock) as UDF \"deepseek_chat\".\n",
    "llm = SimpleByzerLLM()\n",
    "llm.deploy(\n",
    "    model_path=\"\",\n",
    "    pretrained_model_type=\"saas/openai\",\n",
    "    udf_name=\"deepseek_chat\",\n",
    "    infer_params={\n",
    "        \"saas.base_url\": \"https://ark.cn-beijing.volces.com\",\n",
    "        \"saas.api_key\": api_key,\n",
    "        \"saas.model\": \"ep-20250204215011-vzbsg\",\n",
    "        \"saas.is_reasoning\": False,\n",
    "    },\n",
    ")\n",
    "\n",
    "# args.urls is assigned before HttpDoc is built; the argument-less crawl_urls()\n",
    "# call in the DOCX cell then loads that file.\n",
    "args = AutoCoderArgs()\n",
    "args.urls = [DOCX_PATH]\n",
    "http_doc = HttpDoc(args=args, llm=llm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-02-05 09:58:04.159\u001b[0m | \u001b[31m\u001b[1mERROR   \u001b[0m | \u001b[36mautocoder.utils.rest\u001b[0m:\u001b[36m_process_local_file\u001b[0m:\u001b[36m81\u001b[0m - \u001b[31m\u001b[1mFailed to process /Users/allwefantasy/Downloads/测试.docx: 'HttpDoc' object has no attribute 'is_binary_file'\u001b[0m\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PDF Results:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Traceback (most recent call last):\n",
      "  File \"/Users/allwefantasy/projects/auto-coder/src/autocoder/utils/rest.py\", line 59, in _process_local_file\n",
      "    # 分发到不同 loader\n",
      "AttributeError: 'HttpDoc' object has no attribute 'is_binary_file'\n"
     ]
    }
   ],
   "source": [
    "# Test PDF file\n",
    "# The shared `http_doc` from the setup cell is configured with the DOCX path,\n",
    "# so calling it here would only repeat the DOCX test. Build a dedicated HttpDoc\n",
    "# whose args.urls points at a PDF instead.\n",
    "pdf_args = AutoCoderArgs()\n",
    "pdf_args.urls = [\"test_files/sample.pdf\"]\n",
    "pdf_doc = HttpDoc(args=pdf_args, llm=llm)\n",
    "pdf_results = pdf_doc.crawl_urls()\n",
    "print(\"PDF Results:\")\n",
    "for result in pdf_results:\n",
    "    print(f\"File: {result.module_name}\\nContent: {result.source_code[:200]}...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DOCX Results:\n",
      "File: /Users/allwefantasy/Downloads/测试.docx\n",
      "Content: \n",
      "\n",
      "This is a test paragraph.\n",
      "\n",
      "Another paragraph for testing.\n",
      "\n",
      "...\n"
     ]
    }
   ],
   "source": [
    "# Test DOCX file\n",
    "# Crawl the locations configured on http_doc and print a 200-character\n",
    "# preview of each returned document.\n",
    "docx_results = http_doc.crawl_urls()\n",
    "print(\"DOCX Results:\")\n",
    "for doc in docx_results:\n",
    "    name, preview = doc.module_name, doc.source_code[:200]\n",
    "    print(f\"File: {name}\\nContent: {preview}...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test PPTX file\n",
    "# Every executed crawl_urls() call in this notebook takes no arguments and reads\n",
    "# its inputs from the args.urls given to HttpDoc, so configure a dedicated\n",
    "# HttpDoc for the PPTX file rather than passing urls= to crawl_urls().\n",
    "pptx_args = AutoCoderArgs()\n",
    "pptx_args.urls = [\"test_files/sample.pptx\"]\n",
    "pptx_doc = HttpDoc(args=pptx_args, llm=llm)\n",
    "pptx_results = pptx_doc.crawl_urls()\n",
    "print(\"PPTX Results:\")\n",
    "for result in pptx_results:\n",
    "    print(f\"File: {result.module_name}\\nContent: {result.source_code[:200]}...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test Excel file\n",
    "# Every executed crawl_urls() call in this notebook takes no arguments and reads\n",
    "# its inputs from the args.urls given to HttpDoc, so configure a dedicated\n",
    "# HttpDoc for the XLSX file rather than passing urls= to crawl_urls().\n",
    "excel_args = AutoCoderArgs()\n",
    "excel_args.urls = [\"test_files/sample.xlsx\"]\n",
    "excel_doc = HttpDoc(args=excel_args, llm=llm)\n",
    "excel_results = excel_doc.crawl_urls()\n",
    "print(\"Excel Results:\")\n",
    "for result in excel_results:\n",
    "    print(f\"File: {result.module_name}\\nContent: {result.source_code[:200]}...\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "byzerllm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
